# install development version of caret library that contains xgboost models
require(caret)
require(xgboost)
require(data.table)
require(vcd)
require(e1071)

# Load Arthritis dataset in memory.
data(Arthritis)
# Create a copy of the dataset with data.table package
# (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent
# and its performance are really good).
df <- data.table(Arthritis, keep.rownames = FALSE)

# Let's add some new categorical features to see if it helps.
# Of course these feature are highly correlated to the Age feature.
# Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features,
# even in case of highly correlated features.
# For the first feature we create groups of age by rounding the real age.
# Note that we transform it to factor (categorical data) so the algorithm treat them as independant values.
df[, AgeDiscret := as.factor(round(Age / 10, 0))]

# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old.
# I choose this value based on nothing.
# We will see later if simplifying the information based on arbitrary values is a good strategy
# (I am sure you already have an idea of how well it will work!).
df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]

# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small).
df[, ID := NULL]

#-------------Basic Training using XGBoost in caret Library-----------------
# Set up control parameters for caret::train
# Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
# train a xgbTree model using caret::train
model <- train(factor(Improved) ~ ., data = df, method = "xgbTree", trControl = fitControl)

# Instead of tree for our boosters, you can also fit a linear regression or logistic regression model
# using xgbLinear
# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)

# See model results
print(model)
